{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Shallow-Learning Topic Modelling"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the following we will show you how to create a topic model, using a shallow-learning approach. Here we will use the results and the embeddings obtained from the document-document projection of the bipartite graph.\n",
    "\n",
    "**NOTE: This Notebook can only be run after the 01_nlp_graph_creation notebook, as some of the results computed in the first notebook will be here reused.** "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus = pd.read_pickle(\"corpus.p\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "topics = Counter([label for document_labels in corpus[\"label\"] for label in document_labels]).most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('earn', 3964),\n",
       " ('acq', 2369),\n",
       " ('money-fx', 717),\n",
       " ('grain', 582),\n",
       " ('crude', 578),\n",
       " ('trade', 485),\n",
       " ('interest', 478),\n",
       " ('ship', 286),\n",
       " ('wheat', 283),\n",
       " ('corn', 237)]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "topicsList = [topic[0] for topic in topics]\n",
    "topicsSet = set(topicsList)\n",
    "dataset = corpus[corpus[\"label\"].apply(lambda x: len(topicsSet.intersection(x))>0)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a class to \"simulate\" the training of the embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator\n",
    "\n",
    "class EmbeddingsTransformer(BaseEstimator):\n",
    "    \n",
    "    def __init__(self, embeddings_file):\n",
    "        self.embeddings_file = embeddings_file\n",
    "        \n",
    "    def fit(self, *args, **kwargs):\n",
    "        self.embeddings = pd.read_pickle(self.embeddings_file)\n",
    "        return self\n",
    "        \n",
    "    def transform(self, X):\n",
    "        return self.embeddings.loc[X.index]\n",
    "    \n",
    "    def fit_transform(self, X, y):\n",
    "        return self.fit().transform(X)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob \n",
    "files = glob(\"./embeddings/*\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "graphEmbeddings = EmbeddingsTransformer(files[0]).fit()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train/Test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_labels(corpus, topicsList=topicsList):\n",
    "    return corpus[\"label\"].apply(\n",
    "        lambda labels: pd.Series({label: 1 for label in labels}).reindex(topicsList).fillna(0)\n",
    "    )[topicsList]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_features(corpus):\n",
    "    return corpus[\"parsed\"] #graphEmbeddings.transform(corpus[\"parsed\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_features_and_labels(corpus):\n",
    "    return get_features(corpus), get_labels(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_test_split(corpus):\n",
    "    graphIndex = [index for index in corpus.index if index in graphEmbeddings.embeddings.index]\n",
    "    \n",
    "    train_idx = [idx for idx in graphIndex if \"training/\" in idx]\n",
    "    test_idx = [idx for idx in graphIndex if \"test/\" in idx]\n",
    "    return corpus.loc[train_idx], corpus.loc[test_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "train, test = train_test_split(dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Build the model and cross-validation "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.ensemble import RandomForestClassifier \n",
    "from sklearn.multioutput import MultiOutputClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = MultiOutputClassifier(RandomForestClassifier())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline = Pipeline([\n",
    "    (\"embeddings\", graphEmbeddings),\n",
    "    (\"model\", model)\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import RandomizedSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./bipartiteGraphEmbeddings_20_10.p',\n",
       " './bipartiteGraphEmbeddings_10.p',\n",
       " './bipartiteGraphEmbeddings_20_30.p',\n",
       " './bipartiteGraphEmbeddings_20.p',\n",
       " './bipartiteGraphEmbeddings_20_20.p',\n",
       " './bipartiteGraphEmbeddings_30.p']"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_grid = {\n",
    "    \"embeddings__embeddings_file\": files,\n",
    "    \"model__estimator__n_estimators\": [50, 100], \n",
    "    \"model__estimator__max_features\": [0.2,0.3, \"auto\"], \n",
    "    #\"model__estimator__max_depth\": [3, 5]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 255,
   "metadata": {},
   "outputs": [],
   "source": [
    "features, labels = get_features_and_labels(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 256,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import f1_score "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 257,
   "metadata": {},
   "outputs": [],
   "source": [
    "grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1, \n",
    "                           scoring=lambda y_true, y_pred: f1_score(y_true, y_pred,average='weighted'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 258,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/deusebio/.pyenv/versions/3.7.6/envs/ml-book-7/lib/python3.7/site-packages/sklearn/model_selection/_search.py:921: UserWarning: One or more of the test scores are non-finite: [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan\n",
      " nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]\n",
      "  category=UserWarning\n"
     ]
    }
   ],
   "source": [
    "model = grid_search.fit(features, labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 259,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5,\n",
       "             estimator=Pipeline(steps=[('embeddings',\n",
       "                                        EmbeddingsTransformer(embeddings_file='bipartiteGraphEmbeddings_20.p')),\n",
       "                                       ('model',\n",
       "                                        MultiOutputClassifier(estimator=RandomForestClassifier(class_weight='balanced')))]),\n",
       "             n_jobs=-1,\n",
       "             param_grid={'embeddings__embeddings_file': ['./bipartiteGraphEmbeddings_20_10.p',\n",
       "                                                         './bipartiteGraphEmbeddings_10.p',\n",
       "                                                         './bipartiteGraphEmbeddings_20_30.p',\n",
       "                                                         './bipartiteGraphEmbeddings_20.p',\n",
       "                                                         './bipartiteGraphEmbeddings_20_20.p',\n",
       "                                                         './bipartiteGraphEmbeddings_30.p'],\n",
       "                         'model__estimator__max_features': [0.2, 0.3, 'auto'],\n",
       "                         'model__estimator__n_estimators': [50, 100]},\n",
       "             scoring=<function <lambda> at 0x14af7ee60>)"
      ]
     },
     "execution_count": 259,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 260,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'embeddings__embeddings_file': './bipartiteGraphEmbeddings_20_10.p',\n",
       " 'model__estimator__max_features': 0.2,\n",
       " 'model__estimator__n_estimators': 50}"
      ]
     },
     "execution_count": 260,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.best_params_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Evaluate performance "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_predictions(model, features):\n",
    "    return pd.DataFrame(\n",
    "        model.predict(features), \n",
    "        columns=topicsList, \n",
    "        index=features.index\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "metadata": {},
   "outputs": [],
   "source": [
    "preds = get_predictions(model, get_features(test))\n",
    "labels = get_labels(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "metadata": {},
   "outputs": [],
   "source": [
    "errors = 1 - (labels - preds).abs().sum().sum() / labels.abs().sum().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 264,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6702547542160029"
      ]
     },
     "execution_count": 264,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "errors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 265,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.97      0.94      0.95      1087\n",
      "           1       0.93      0.74      0.83       719\n",
      "           2       0.79      0.45      0.57       179\n",
      "           3       0.96      0.64      0.77       149\n",
      "           4       0.95      0.59      0.73       189\n",
      "           5       0.95      0.45      0.61       117\n",
      "           6       0.87      0.41      0.56       131\n",
      "           7       0.83      0.21      0.34        89\n",
      "           8       0.69      0.34      0.45        71\n",
      "           9       0.61      0.25      0.35        56\n",
      "\n",
      "   micro avg       0.94      0.72      0.81      2787\n",
      "   macro avg       0.85      0.50      0.62      2787\n",
      "weighted avg       0.92      0.72      0.79      2787\n",
      " samples avg       0.76      0.75      0.75      2787\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/deusebio/.pyenv/versions/3.7.6/envs/ml-book-7/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(labels, preds))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ml-book-7",
   "language": "python",
   "name": "ml-book-7"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
